library(data.table)
library(ggplot2)
library(stringr)
library(readxl)
library(haven)
library(lfe)
library(RColorBrewer)

library(bigvis)


rm(list = ls())


setwd("~/Dropbox (Personal)/Trade shocks 1900s/Replication Archive")

# import trade data
imports_belgium <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Imports from Belgium") %>% data.table()
imports_germany <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Imports from Germany") %>% data.table()
imports_netherlands <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Imports from Netherlands") %>% data.table()

categories <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Categories")

categories_uk <- data.table(categories)[!is.na(category_uk),
    .(article, category_uk, drop)]
categories_agr <- data.table(categories)[!is.na(category_agr),
    .(article, category_agr)]
head(categories_uk)
head(categories_agr)
imports_uk <- rbindlist(list(imports_belgium, imports_germany, imports_netherlands),
    fill = T)
head(imports_uk)
head(imports_netherlands)
colnames(imports_uk)[1] <- "article"
imports_categories_uk <- merge(imports_uk, categories_uk, by = "article", all.x = T)
head(imports_categories_uk)
imports_categories_uk <- imports_categories_uk[!is.na(category_uk),
    .(value_1880 = sum(value_1880, na.rm = T),
    value_1885 = sum(value_1885_2, na.rm = T),
    value_1886 = sum(value_1886, na.rm = T),
    value_1890 = sum(value_1890, na.rm = T),
    value_1892 = sum(value_1892, na.rm = T),
    value_1895 = sum(as.numeric(value_1895), na.rm = T),
    value_1900 = sum(as.numeric(value_1900), na.rm = T),
    value_1906 = sum(as.numeric(value_1906), na.rm = T),
    value_1910 = sum(as.numeric(value_1910), na.rm = T)),
    by = .(category_uk, drop)]
head(imports_categories_uk)
# melting from wide to long format
imports_categories_uk <- melt(imports_categories_uk,
    id.vars = c("category_uk", "drop"))
head(imports_categories_uk)

imports_nas <- expand.grid(category_uk = "others", drop = 0,
    variable = unique(imports_categories_uk$variable), value = 0)
imports_categories_uk <- rbind(imports_categories_uk, imports_nas)

imports_categories_uk[, value_tot := sum(value), by = .(category_uk, variable)]




imports_categories_uk[, value_drop := sum(value * drop),
    by = .(category_uk, variable)]
imports_categories_uk[, variable := as.character(variable)]
imports_categories_uk[, len := nchar(variable)]
imports_categories_uk[, year := substr(variable, len - 3, len)]
imports_categories_uk[, year := as.numeric(year)]
imports_categories_uk <- unique(imports_categories_uk[,
    .(category_uk, year, value_tot, value_drop)])
imports_categories_uk[, value_consist := value_tot - value_drop]

exports_uk <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Exports to Germany") %>% data.table()
colnames(exports_uk)[1] <- "article"
categories_exports <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Categories exports") %>% data.table()
exports_uk <- merge(exports_uk, categories_exports, by = "article",
    all.x = T, all.y = T)
exports_uk[is.na(category_uk), article]
head(exports_uk)
exports_uk <- exports_uk[!is.na(category_uk),
    .(value_1885 = sum(value_1885, na.rm = T),
    #value_1886 = sum(value_1886, na.rm = T),
    value_1892 = sum(as.numeric(value_1892), na.rm = T),
    value_1895 = sum(as.numeric(value_1895), na.rm = T),
    value_1900 = sum(as.numeric(value_1900), na.rm = T),
    value_1906 = sum(as.numeric(value_1906), na.rm = T),
    value_1910 = sum(as.numeric(value_1910), na.rm = T)),
    by = .(category_uk)]

exports_uk <- melt(exports_uk,
    id.vars = c("category_uk"))
exports_uk[, variable := as.character(variable)]
exports_uk[, len := nchar(variable)]
exports_uk[, year := substr(variable, len - 3, len)]
exports_uk[, year := as.numeric(year)]
exports_uk <- exports_uk[, .(category_uk, year, exports = value)]

imports_categories_uk <- merge(imports_categories_uk, exports_uk, all.x = T,
    by = c("category_uk", "year"))
imports_categories_uk[is.na(exports), exports := 0]
head(imports_categories_uk)
imports_categories_uk[, net_imports := value_tot - exports]
imports_categories_uk[year != 1890, value_lag := shift(value_tot, 1),
    by = .(category_uk)]

imports_us_base <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Imports from US") %>% data.table()
colnames(imports_us_base)[1] <- "article"

imports_us <- merge(imports_us_base, categories_uk, by = "article",
    all.x = T, all.y = T)
imports_us[is.na(category_uk), article]
head(imports_us)
imports_us <- imports_us[!is.na(category_uk),
    .(value_1885 = sum(value_1885, na.rm = T),
    value_1886 = sum(value_1886, na.rm = T),
    value_1892 = sum(value_1892, na.rm = T),
    value_1895 = sum(value_1895, na.rm = T),
    value_1900 = sum(value_1900, na.rm = T),
    value_1906 = sum(value_1906, na.rm = T),
    value_1910 = sum(value_1910, na.rm = T)),
    by = .(category_uk)]

imports_us <- melt(imports_us,
    id.vars = c("category_uk"))
imports_us[, variable := as.character(variable)]
imports_us[, len := nchar(variable)]
imports_us[, year := substr(variable, len - 3, len)]
imports_us[, year := as.numeric(year)]
imports_us <- imports_us[, .(category_uk, year, imports_us = value)]

imports_categories_uk <- merge(imports_categories_uk, imports_us, all.x = T,
    by = c("category_uk", "year"))
imports_categories_uk[is.na(imports_us), imports_us := 0]
imports_categories_uk[, imports_both := value_tot + imports_us]
head(imports_categories_uk)

head(imports_categories_uk)
fwrite(imports_categories_uk, "Data/Intermediate outputs/import_year_data.csv")

imports_categories_agr <- merge(imports_uk, categories_agr, by = "article",
    all.x = T)
head(imports_categories_agr)
imports_categories_agr <- imports_categories_agr[!is.na(category_agr),
    .(value_1885 = sum(value_1885_2, na.rm = T),
    value_1886 = sum(value_1886, na.rm = T),
    value_1892 = sum(value_1892, na.rm = T),
    value_1895 = sum(as.numeric(value_1895), na.rm = T),
    value_1900 = sum(as.numeric(value_1900), na.rm = T),
    value_1906 = sum(as.numeric(value_1906), na.rm = T),
    value_1910 = sum(as.numeric(value_1910), na.rm = T)),
    by = .(category_agr)]

imports_categories_agr <- melt(imports_categories_agr,
    id.vars = c("category_agr"))

imports_categories_agr[, variable := as.character(variable)]
imports_categories_agr[, len := nchar(variable)]
imports_categories_agr[, year := substr(variable, len - 3, len)]
imports_categories_agr[, year := as.numeric(year)]
imports_categories_agr <- imports_categories_agr[,
    .(category_agr, year, value_agr = value)]

imports_us_agr <- merge(imports_us_base, categories_agr, by = "article",
    all.x = T)
head(imports_us_agr)
imports_us_agr <- imports_us_agr[!is.na(category_agr),
    .(value_1885 = sum(value_1885, na.rm = T),
    value_1886 = sum(value_1886, na.rm = T),
    value_1892 = sum(value_1892, na.rm = T),
    value_1895 = sum(value_1895, na.rm = T),
    value_1900 = sum(value_1900, na.rm = T),
    value_1906 = sum(value_1906, na.rm = T),
    value_1910 = sum(value_1910, na.rm = T)),
    by = .(category_agr)]

imports_us_agr <- melt(imports_us_agr,
    id.vars = c("category_agr"))

imports_us_agr[, variable := as.character(variable)]
imports_us_agr[, len := nchar(variable)]
imports_us_agr[, year := substr(variable, len - 3, len)]
imports_us_agr[, year := as.numeric(year)]
imports_us_agr <- imports_us_agr[,
    .(category_agr, year, imports_us_agr = value)]

imports_categories_agr <- merge(imports_categories_agr, imports_us_agr,
    by = c("category_agr", "year"), all.x = T, all.y = T)
imports_categories_agr[is.na(imports_us_agr), imports_us_agr := 0]
head(imports_categories_agr)
imports_categories_agr[, imports_both_agr := value_agr + imports_us_agr]


# read 1881 census: change filepath to location of 1881 census data
census_1881 <- fread("Data/Census/Census 1881/ipumsi_00025.csv")
colnames(census_1881) <- tolower(colnames(census_1881))
# count employed population by occupation by parish
colnames(census_1881)


census_1881 <- census_1881[labforce != 1, .(n = .N,
    n_foreign = sum(nativity == 2),
    n_irish = sum(bplcountry == 42060)
    ),
    by = .(countyuk, parishuk_1881, occicem, occhisco)]

# crosswalks from parish to constituency
parish_const <- fread(
    "Data/Parish constituency crosswalks/jusko_parishuk_1881_clean.csv")
census_uk <- merge(census_1881, parish_const, by = "parishuk_1881",
    allow.cartesian = T)



# aggregate occupation count by constituency
census_uk <- census_uk[, .(n = sum(n), n_foreign = sum(n_foreign),
    n_irish = sum(n_irish)),
    by = .(g_unit, constituency_id, constituency_name, county_name, const_type,
        occicem, occhisco)]


census_uk[, `:=`(const_pop = sum(n),
    const_n_foreign = sum(n_foreign),
    const_n_irish = sum(n_irish)),
    by = .(constituency_id)]
census_uk[, `:=`(const_frac_foreign = const_n_foreign / const_pop,
    const_frac_irish = const_n_irish / const_pop)]

# classify occupations as primary, secondary, tertiary
pst <- read_excel("Data/PST/I-CeMOccode_1881EW_51EA_17EA_PST_SIC Final.xlsx")



pst <- data.table(pst)[, .(occicem = OCCODE, pst_sector = PSTsector,
    cat17 = `17cat`, cat51 = `51cat`, ea51 = `51EA`)]


census_uk <- merge(census_uk, pst, all.x = T, by = "occicem")
census_uk[, `:=`(
    secondary = sum( (pst_sector == "SECONDARY") * n, na.rm = T)),
    by = .(constituency_id)]
# share of secondary employment by constituency
census_uk[, `:=`(const_frac_secondary = secondary / const_pop)]

# now link occicem to categories
occicem_categories <- read_excel("Data/Trade statistics/occicem_categories.xlsx",
    sheet = "icem to categories")
head(occicem_categories)
occicem_categories_uk <- data.table(occicem_categories)[,
    .(occicem = OCCODE, category_uk)]

census_uk <- merge(census_uk, occicem_categories_uk, by = "occicem",
    all.x = T)

census_uk <- census_uk[, .(n = sum(n)),
    by = .(g_unit, constituency_id, constituency_name, county_name, const_type,
        const_pop, const_frac_secondary, const_frac_foreign,
        const_frac_irish, category_uk)]

constituency_cross_section <- unique(census_uk[, .(constituency_id, constituency_name,
    county_name, g_unit, const_type, const_pop,
    const_frac_secondary, const_frac_foreign, const_frac_irish)])

constituency_details <- unique(census_uk[, .(constituency_id, constituency_name,
    g_unit, const_pop)])
constituency_category_counts <- census_uk[, .(constituency_id, category_uk, n)]

shares_uk <- expand.grid(constituency_id = unique(census_uk[, constituency_id]),
    category_uk = unique(census_uk[, category_uk])) %>% data.table()
shares_uk <- merge(shares_uk, constituency_details, by = "constituency_id",
    all.x = T)
shares_uk <- merge(shares_uk, constituency_category_counts,
    by = c("constituency_id", "category_uk"), all.x = T)
# code empty shares as zero
shares_uk[is.na(n), n := 0]
shares_uk[, nat_category_n := sum(n), by = .(category_uk)]
shares_uk[, const_frac_category := n / const_pop]
shares_uk[is.na(category_uk), category_uk := "others"]


head(imports_categories_uk)

## separate route for agricultural aggregate

head(occicem_categories)
occicem_categories_agr <- data.table(occicem_categories)[,
    .(occicem = OCCODE, category_agr)]
unique(occicem_categories_agr$category_agr)
census_agr <- merge(census_1881, occicem_categories_agr, by = "occicem",
    all.x = T)
head(census_agr)
unique(census_agr[, .(category_agr)])
census_agr[, category_old := category_agr]
census_agr[, category_agr := NULL]

county_agr <- read_excel("Data/Agriculture/Returns of agriculture 1885.xlsx",
    sheet = "results by county 1885", skip = 79) %>% data.table()

county_agr <- melt(county_agr, id.vars = "category_agr")

county_agr <- county_agr[, .(category_old = "agriculture", category_agr,
    countyuk = as.numeric(as.character(variable)),
    category_weight = value)]
census_agr <- merge(census_agr, county_agr, by = c("countyuk", "category_old"),
    all.x = T, allow.cartesian = T)
census_agr[!is.na(category_old) & is.na(category_agr), category_agr := category_old]

census_agr[is.na(category_weight), category_weight := 1 ]
census_agr[, n_new := n * category_weight ]
# for london and channel islands no information on agriculture, so code ag to NA
census_agr[category_agr == "agriculture", category_agr := NA]

census_agr <- census_agr[, .(n = sum(n_new)),
    by = .(parishuk_1881, category_agr)]

census_agr <- merge(census_agr, parish_const, by = "parishuk_1881",
    allow.cartesian = T)

# aggregate occupation count by constituency
census_agr <- census_agr[, .(n = sum(n)),
    by = .(g_unit, constituency_id, constituency_name, county_name, const_type,
        category_agr)]

shares_agr <- expand.grid(constituency_id = unique(census_agr[, constituency_id]),
    category_agr = unique(census_agr[, category_agr])) %>% data.table()
head(shares_agr)
shares_agr <- merge(shares_agr, constituency_details, by = "constituency_id",
    all.x = T)
shares_agr <- merge(shares_agr,
    unique(census_agr[, .(constituency_id, category_agr, n)]),
    by = c("constituency_id", "category_agr"), all.x = T)

shares_agr[is.na(n), n := 0]
shares_agr[, nat_category_agr_n := sum(n), by = .(category_agr)]
shares_agr[, const_frac_category_agr := n / const_pop]

# now link shares to shocks
shocks_uk <- merge(shares_uk[!is.na(category_uk)],
    imports_categories_uk, all.x = T, all.y = T,
    by = "category_uk", allow.cartesian = T)
head(shocks_uk)
head(imports_categories_uk)
values_uk_1885 <- imports_categories_uk[year == 1885,
    .(category_uk, value_consist_85 = value_consist, value_tot_85 = value_tot,
        net_imports_85 = net_imports, imports_us_85 = imports_us,
        imports_both_85 = imports_both,
        exports_85 = exports)]
shocks_uk <- merge(shocks_uk, values_uk_1885, by = "category_uk", all.x = T)
values_uk_1880 <- imports_categories_uk[year == 1880,
    .(category_uk, value_tot_80 = value_tot)]
shocks_uk <- merge(shocks_uk, values_uk_1880, by = "category_uk", all.x = T)

values_uk_1900 <- imports_categories_uk[year == 1900,
    .(category_uk, value_consist_00 = value_consist, value_tot_00 = value_tot,
        net_imports_00 = net_imports, exports_00 = exports)]

shocks_uk <- merge(shocks_uk, values_uk_1900, by = "category_uk", all.x = T)


shocks_uk[, `:=`(diff_percap = (value_tot - value_tot_85) / nat_category_n,
    fd_percap = (value_tot - value_lag) / nat_category_n,
    import_percap = value_tot / nat_category_n,
    diff_percap00 = (value_tot - value_tot_00) / nat_category_n,
    export_diff_percap = (exports - exports_85) / nat_category_n,
    export_diff_percap00 = (exports - exports_00) / nat_category_n
    )]

shocks_uk <- shocks_uk[category_uk != "asbestos"]
shocks_uk[is.na(diff_percap), category_uk] %>% unique()

quantile(shocks_uk$diff_percap, c(0.9, 0.95,  0.98, 0.99))
quantile(shocks_uk$diff_percap, c(0.9, 0.95, 0.975, 0.98, 0.99))
mean(shocks_uk$diff_percap < -500)
mean(shocks_uk$diff_percap < 500)


winsorize <- function(vec, cap){
    new_vec <- vec
    new_vec[vec < -cap] <- -cap
    new_vec[vec > cap] <- cap
    return(new_vec)
}

shocks_uk[, `:=`(diff_percap_w = winsorize(diff_percap, 500),
    diff_percap00w = winsorize(diff_percap00, 500),
    fd_percap_w = winsorize(fd_percap, 500),
    export_diff_percap_w = winsorize(export_diff_percap, 500),
    export_diff_percap00w = winsorize(export_diff_percap00, 500))]



quantile(shocks_uk[, diff_percap00], c(0.01, 0.02, 0.98, 0.99))

shocks_uk[, `:=`(shock_diff = sum(diff_percap * const_frac_category),
    fd_shock = sum(fd_percap * const_frac_category),
    fd_shock_w = sum(fd_percap_w * const_frac_category),
    shock_diff_w = sum(diff_percap_w * const_frac_category),
    shock_diff00 = sum(diff_percap00 * const_frac_category),
    shock_diff00w = sum(diff_percap00w * const_frac_category),
    export_shock = sum(export_diff_percap * const_frac_category),
    export_shock00 = sum(export_diff_percap00 * const_frac_category),
    export_shock_w = sum(export_diff_percap_w * const_frac_category),
    export_shock00w = sum(export_diff_percap00w * const_frac_category)),
    by = .(constituency_id, year)]

# long first difference shocks
shocks_uk[year %in% c(1880, 1890, 1900, 1910),
    diff_percap_lag_census := shift(diff_percap, 1),
    by = .(constituency_id, category_uk)]
shocks_uk[year %in% c(1890, 1900, 1910),
    fd_percap_census := diff_percap - diff_percap_lag_census]
shocks_uk[year %in% c(1890, 1900, 1910),
    fd_percap_census_w := winsorize(fd_percap_census, 500)]
shocks_uk[year %in% c(1890, 1900, 1910),
    `:=`(fd_shock_census = sum(fd_percap_census * const_frac_category),
    fd_shock_census_w = sum(fd_percap_census_w * const_frac_category)),
    by = .(constituency_id, year)]


shocks_uk[year %in% c(1885, 1892, 1900, 1910),
    diff_percap_lag_elec10 := shift(diff_percap, 1),
    by = .(constituency_id, category_uk)]
shocks_uk[year %in% c(1892, 1900, 1910),
    fd_percap_elec10 := diff_percap - diff_percap_lag_elec10]
shocks_uk[year %in% c(1892, 1900, 1910),
    fd_percap_elec10_w := winsorize(fd_percap_elec10, 500)]
shocks_uk[year %in% c(1892, 1900, 1910),
    `:=`(fd_shock_elec10 = sum(fd_percap_elec10 * const_frac_category),
    fd_shock_elec10_w = sum(fd_percap_elec10_w * const_frac_category)),
    by = .(constituency_id, year)]

# add shock for December 1910 election
shocks_uk_1911 <- shocks_uk[year == 1910]
shocks_uk_1911[, year := 1911]
shocks_uk <- rbind(shocks_uk, shocks_uk_1911)
rm(shocks_uk_1911)

# agricultural shock data
shocks_agr <- merge(shares_agr[!is.na(category_agr)],
    imports_categories_agr, all.x = T, all.y = T,
    by = "category_agr", allow.cartesian = T)
colnames(shocks_agr)
values_agr_1885 <- imports_categories_agr[year == 1885,
    .(category_agr, value_agr_85 = value_agr,
        imports_us_agr_85 = imports_us_agr,
        imports_both_agr_85 = imports_both_agr)]
shocks_agr <- merge(shocks_agr, values_agr_1885, by = "category_agr", all.x = T)
shocks_agr[, `:=`(
    diff_us_percap = (imports_us_agr - imports_us_agr_85) / nat_category_agr_n)]

shocks_agr_1911 <- shocks_agr[year == 1910]
shocks_agr_1911[, year := 1911]
shocks_agr <- rbind(shocks_agr, shocks_agr_1911)
rm(shocks_agr_1911)


shocks_industry_clean <- shocks_uk[, .(constituency_id, g_unit, constituency_name,
    year, category_uk, const_frac_category, diff_percap,
    diff_percap_w, diff_percap00w, fd_percap_w,
    fd_percap_census_w,
    fd_percap_elec10_w,
    export_diff_percap_w, export_diff_percap00w)]
constituency_year <- unique(shocks_uk[, .(constituency_id, g_unit,
    constituency_name, year, shock_diff, shock_diff_w,
    shock_diff00w, export_shock_w,
    export_shock00w,
    fd_shock_w, fd_shock_census_w, fd_shock_elec10_w)])
fwrite(shocks_industry_clean, "Data/Intermediate outputs/shocks_and_shares.csv")
rm(shocks_industry_clean)

# disregard: code for checking identical to original analysis dataset
#df_temp1 <- fread("~/Dropbox (Personal)/Trade shocks 1900s/Data/#Clean for analysis/shocks_and_shares.csv")
#df_temp2 <- fread("Data/Intermediate outputs/shocks_and_shares.csv")
#head(df_temp2)
#head(df_temp1)
#sub_cols <- colnames(df_temp2)
#df_temp3 <- df_temp1[, ..sub_cols]
#all.equal(df_temp3, df_temp2)




# Electoral data
es_elections <- fread("Data/Eggers Spirling/elections.csv")


es_returns <- fread("Data/Eggers Spirling/election_returns.csv")

es_elections <- merge(es_elections, es_returns, by = "election_id")
es_elections[, year := as.numeric(substr(date, 1, 4))]
es_elections[, month := as.numeric(substr(date, 6, 7))]
# recode december 1910 election as 1911
es_elections[year == 1910 & month == 12, year := 1911]
es_elections[, rank := rank(-votes), by = .(election_id)]
es_elections[seats_up == 4, top_candidates := 1 * (rank <= 8)]
es_elections[seats_up == 3, top_candidates := 1 * (rank <= 6)]
es_elections[seats_up == 2, top_candidates := 1 * (rank <= 4)]
es_elections[seats_up == 1, top_candidates := 1 * (rank <= 2)]
unique(es_elections[, seats_up])




es_elections <- es_elections[between(year, 1880, 1913)]
winners_list <- es_elections[winner == 1,
    .(last_election = election_id, member_id, party)]

election_list <- es_elections[, .(constituency.id, date, election_id)] %>%
    unique()
setorder(election_list, date)
election_list[, last_election := shift(election_id, 1), by = .(constituency.id)]
election_list <- merge(election_list, winners_list, by = "last_election")
head(election_list)
election_list <- election_list[,
    .(election_id, incumbent_member_id = member_id, incumbent_party = party)]
incumbents <- merge(es_elections, election_list, by = "election_id")
incumbents[, `:=`(incumbent_mp = as.numeric(member_id == incumbent_member_id),
    incumbent_party = as.numeric(party == incumbent_party))]

incumbents <- incumbents[, .(incumbent_mp = max(incumbent_mp, na.rm = T),
    incumbent_party = max(incumbent_party, na.rm = T)),
    by = .(election_id, name_on_return)]
incumbents[incumbent_mp == -Inf, incumbent_mp := 0]
head(incumbents)
es_elections <- merge(es_elections, incumbents,
    by = c("election_id", "name_on_return"), all.x = T)

df_services <- fread("Data/Eggers Spirling/services.csv")

member_elections <- es_elections[, .(election_id, date,
    constituency_id = constituency.id, member_id)]
member_elections <- merge(member_elections[!is.na(member_id)], df_services,
    by = c("member_id", "constituency_id"))
head(member_elections)
member_elections[, incumbent_mp2 := 1 * (date > start_date & date <= end_date)]
member_elections[incumbent_mp2 == 1]
member_elections <- member_elections[incumbent_mp2 == 1,
    .(member_id, election_id, incumbent_mp2)]
es_elections <- merge(es_elections, member_elections,
    by = c("member_id", "election_id"), all.x = T)
es_elections[is.na(incumbent_mp2), incumbent_mp2 := 0]

es_elections <- es_elections[between(year, 1885, 1914)]
print(es_elections[, .(n = .N), by = .(party)], nrow = 69)
conservatives <- c("C", "LU", "Ind C", "Ind C (C)", "C*", "C (Nat P)",
    "LU (Ind L)", "C (Ind C)", "LU*", "C (L)", "LU (C)", "C* (Nat P)",
    "Ind LU", "LU (L)", "Ind C (L)", 'LU (Nat P) (C)', 'C (Nat P) (C)',
    'LU/Crf', 'Ind. C', 'Ind. LU')
leftist <- c('L/Lab', 'ILP', 'Lab', 'Ind Lab', 'SDF', 'SDP', 'Lab (L/Lab)',
    'Ind L/Lab', 'Ind Lab (L/Lab)', 'Ind Lab (ILP)', 'Ind Lab (Lab)', 'Ind. Lab',
    "SWRC", "SPLP", "SLP", "SPP", "SSF & SUTCLP", "SLRL", "BSP"
    )
labour <- c('L/Lab', 'Lab', 'Lab (L/Lab)', "L/Lab (Lab)")
labour_party <- c('Lab', 'Lab (L/Lab)')
liberals <- c("L", "L/Lab", "L (LU)", "Ind L", "Ind. L", "L(LU)", "L (Ind L)",
    "Ind L/Lab", "Ind L/Crf", "L/Crf", "Ind L (L)", "Ind L/Crf (LU)",
    "L/Lab (Lab)", "Ind L (LU)")

es_elections[, `:=`(conservative = 1 * party %in% conservatives,
    left = 1 * party %in% leftist,
    lab = 1 * party %in% labour,
    lab_p = 1 * party %in% labour_party,
    liberal = 1 * party %in% liberals)]

es_elections[is.na(unopposed), unopposed := 0]


es_elections_ag <- es_elections[ by_election == 0,
    .(total_votes = sum(votes, na.rm = T),
    incumbent_mp_votes = sum(votes * incumbent_mp, na.rm = T),
    #incumbent_mp2_votes = sum(votes * incumbent_mp2, na.rm = T),
    incumbent_party_votes = sum(votes * incumbent_party, na.rm = T),
    incumbent_mp_present = sum(incumbent_mp, na.rm = T),
    cons_votes = sum(votes * conservative, na.rm = T),
    lab_votes = sum(votes * lab, na.rm = T),
    lab_p_votes = sum(votes * lab_p, na.rm = T),
    lib_votes = sum(votes * liberal, na.rm = T),
    unopposed = as.numeric( sum(unopposed, na.rm = T) > 0)),
    by = .(election_id, electors, constituency_id = constituency.id, year)]
head(es_elections_ag)




es_elections_ag[, `:=`(cons_share = cons_votes / total_votes,
    lab_share = lab_votes / total_votes,
    lab_p_share = lab_p_votes / total_votes,
    lib_share = lib_votes / total_votes,
    incumbent_mp_share = incumbent_mp_votes / total_votes,
    #incumbent_mp2_share = incumbent_mp2_votes / total_votes,
    incumbent_party_share = incumbent_party_votes / total_votes,
    lab_present = 1 * (lab_votes > 0))]
es_elections_ag <- es_elections_ag[, .(election_id, constituency_id,
    year, electors, lab_present, lab_share, lab_p_share, lib_share, cons_share,
    total_votes, unopposed, incumbent_mp_share, incumbent_party_share)]

print(colnames(es_elections_ag))
constituency_year <- merge(constituency_year, es_elections_ag,
    by = c("constituency_id", "year"), all.x = T, all.y = T)
constituency_year[year %in% c(1885, 1892, 1900, 1910),
    cons_share_elec10_lag := shift(cons_share, 1),
    by = .(constituency_id)]
constituency_year[, cons_share_1900 := cons_share[year == 1900],
    by = .(constituency_id)]
constituency_year[year == 1911, cons_share_elec10_lag := cons_share_1900]
constituency_year[year %in% c(1892, 1900, 1910, 1911),
    fd_cons_elec10 := cons_share - cons_share_elec10_lag]



constituency_year[, `:=`(lab_share_lag = shift(lab_share, 1),
    lib_share_lag = shift(lib_share, 1),
    cons_share_lag = shift(cons_share, 1),
    unopposed_lag = shift(unopposed)),
    by = .(constituency_id)]
constituency_year[, `:=`(fd_lab = lab_share - lab_share_lag,
    fd_lib = lib_share - lib_share_lag,
    fd_cons = cons_share - cons_share_lag)]
constituency_year[, fd_incum := fd_cons]
constituency_year[year %in% c(1886, 1910, 1911), fd_incum := fd_lib]



# US wheat shock
head(shocks_agr)
wheat_shares <- unique(shocks_agr[category_agr == "wheat",
    .(constituency_id, const_frac_wheat = const_frac_category_agr)])
constituency_cross_section <- merge(constituency_cross_section,
    wheat_shares, by = "constituency_id", all.x = T)

wheat_shocks <- shocks_agr[category_agr == "wheat" & !is.na(constituency_id),
    .(constituency_id, year, #wheat_us_diff = diff_us_percap,
        wheat_us_shock = diff_us_percap * const_frac_category_agr)]
head(wheat_shocks)
constituency_year <- merge(constituency_year, wheat_shocks,
    by = c("constituency_id", "year"), all.x = T)



pst <- read_excel("Data/PST/I-CeMOccode_1881EW_51EA_17EA_PST_SIC Final.xlsx")



pst <- data.table(pst)[, .(occicem = OCCODE, pst_sector = PSTsector, cat17 = `17cat`,
    cat51 = `51cat`, ea51 = `51EA`)]


df_class <- read_excel("Data/Class categories/occicem_class.xlsx")
head(df_class)

hiscam <- fread("Data/HISCAM/hiscam_gb.csv")
head(hiscam)
colnames(hiscam) <- c("occhisco", "hiscam")



prep_census <- function(census_file, crosswalk_file, par_var, census_year_no){

    df <- fread(census_file)
    colnames(df) <- tolower(colnames(df))
    colnames(df)[which(colnames(df) == par_var)] <- "parishuk"



    par_const <- fread(crosswalk_file)
    df <- df[,
        .(n = .N),
        by = .(parishuk, occicem, occhisco)]
    df <- merge(df, par_const, by = "parishuk",
        all.x = T, all.y = T, allow.cartesian = T)
    df <- merge(df, pst, all.x = T, by = "occicem")
    df <- merge(df, df_class, all.x = T, by = "occicem")
    df <- merge(df, hiscam, by = "occhisco", all.x = T)
    df[, n_w := n * jd_weight2]
    head(df)


    df <- df[!is.na(constituency_id),
        .(
        n_secondary = sum( (pst_sector == "SECONDARY") * n_w, na.rm = T),
        n = sum(n_w, na.rm = T),
        n_unskilled = sum(n_w * (rg_class_1913 == 5), na.rm = T),
        n_vagrant = sum(n_w * (occicem == 797), na.rm = T),
        hiscam = weighted.mean(hiscam, n_w, na.rm = T)),
        by = .(g_unit, constituency_id, constituency_name, county_name,
            const_type)]
    head(df)


    df <- df[, .(g_unit, constituency_id, constituency_name,
        county_name, const_type,
        census_year = census_year_no,
        pop = n,
        frac_secondary = n_secondary / n,
        frac_unskilled = n_unskilled / n,
        frac_vagrant = n_vagrant / n,
        hiscam)]
    return(df)
}




# process data on economic outcomes from census microdata. Change census_file
# arguments to locations of census data.
dem_1881 <- prep_census(census_file = "Data/Census/Census 1881/ipumsi_00025.csv",
    crosswalk_file = "Data/Parish constituency crosswalks/TS_parish_const_1881_weighted_07_19.csv",
    par_var = "parishuk_1881",
    census_year_no = 1881)
dem_1891 <- prep_census(census_file = "Data/Census/Census 1891/ipumsi_00017.csv",
    crosswalk_file = "Data/Parish constituency crosswalks/TS_parish_const_1891_weighted_07_19.csv",
    par_var = "parishuk_1891",
    census_year_no = 1891)
dem_1901 <- prep_census(census_file = "Data/Census/Census 1901/ipumsi_00023.csv",
    crosswalk_file = "Data/Parish constituency crosswalks/TS_parish_const_1901_weighted_07_19.csv",
    par_var = "parishuk_1901",
    census_year_no = 1901)
dem_1911 <- prep_census(census_file = "Data/Census/Census 1911/ipumsi_00018.csv",
    crosswalk_file = "Data/Parish constituency crosswalks/TS_parish_const_1911_weighted_07_19.csv",
    par_var = "parishuk_1911",
    census_year_no = 1911)
dem <- rbindlist(list(dem_1881, dem_1891, dem_1901, dem_1911))
head(dem)

constituency_year[, census_year := floor(year / 10) * 10 + 1]
constituency_year <- constituency_year[!is.na(constituency_id)]
constituency_year2 <- merge(constituency_year, dem,
    by = c("constituency_id", "g_unit", "constituency_name",  "census_year"),
    all.x = T)
fwrite(constituency_year2, "Data/Intermediate outputs/constituency_panel.csv")

# disregard: check that results match original analysis datasets
#df_temp1 <- fread("~/Dropbox (Personal)/Trade shocks 1900s/Data/#Clean for analysis/constituency_panel.csv")
#df_temp2 <- fread("Data/Intermediate outputs/constituency_panel.csv")


#short_cols <- colnames(df_temp2)
#df_temp3 <- df_temp1[, ..short_cols]

#all.equal(df_temp2, df_temp3)

# create groups to aggregate constituencies in cities

manchester <- c("Manchester East", "Manchester North", "Manchester North West",
    "Manchester South", "Manchester South West", "Salford North",
    "Salford South", "Salford West")
london <- c("Battersea and Clapham, Battersea",
    "Battersea and Clapham, Clapham", "Bethnal Green North East",
    "Bethnal Green South West", "Southwark West", "Southwark Rotherhithe",
    "Camberwell Dulwich", "Camberwell North", "Camberwell Peckham",
    "Chelsea", "City of London", "Ealing", "Finsbury Central", "Finsbury East",
    "Finsbury Holborn", "Hackney Central", "Hackney North", "Hackney South",
    "Hammersmith", "Hampstead", "Islington East", "Islington West",
    "Kensington South", "Lambeth Brixton", "Lambeth Kennington",
    "Lambeth North", "Lambeth Norwood", "Marylebone East", "Marylebone West",
    "Paddington North",  "Paddington South", "Shoreditch Haggerston",
    "Shoreditch Hoxton", "Southwark Bermondsey", "Southwark Rotherhithe",
    "Southwark West", "St George's Hanover Square", "St Pancras East",
    "St Pancras North", "St Pancras South","St Pancras West",
    "Tottenham", "Tower Hamlets Bow and Bromley", "Tower Hamlets Limehouse",
    "Tower Hamlets Mile End", "Tower Hamlets Poplar", "Tower Hamlets St George",
    "Tower Hamlets Stepney", "Tower Hamlets Whitechapel", "Walthamstow",
    "Wandsworth", "West Ham North", "West Ham South", "Westminster",
    "Wimbledon", "Woolwich", "Strand")
wolverhampton <- c("Wolverhampton East", "Wolverhampton South",
    "Wolverhampton West")
sheffield <- c("Sheffield Attercliffe", "Sheffield Brightside",
    "Sheffield Central", "Sheffield Ecclesall", "Sheffield Hallam")
leeds <- c("Leeds Central", "Leeds East", "Leeds North", "Leeds South",
    "Leeds West")
birmingham <- c("Birmingham Bordesley", "Birmingham Central", "Birmingham East",
    "Birmingham Edgbaston", "Birmingham North", "Birmingham South",
    "Birmingham West" )
hull <- c("Kingston upon Hull Central", "Kingston upon Hull East",
    "Kingston upon Hull West")
liverpool <- c("Bootle", "Liverpool Abercromby", "Liverpool East Toxteth",
    "Liverpool Everton", "Liverpool Exchange", "Liverpool Kirkdale",
    "Liverpool Scotland", "Liverpool Walton", "Liverpool West Derby",
    "Liverpool West Toxteth")
bradford <- c("Bradford Central", "Bradford East", "Bradford West" )
bristol <- c("Bristol East", "Bristol North", "Bristol South", "Bristol West")
swansea <- c("Swansea District of Boroughs", "Swansea Town")
nottingham <- c("Nottingham East", "Nottingham South", "Nottingham West")

constituency_cross_section[, constituency_group := constituency_name]
constituency_cross_section[constituency_name %in% manchester,
    constituency_group := "Manchester"]
constituency_cross_section[constituency_name %in% london,
    constituency_group := "London"]
constituency_cross_section[constituency_name %in% wolverhampton,
    constituency_group := "Wolverhampton"]
constituency_cross_section[constituency_name %in% sheffield,
    constituency_group := "Sheffield"]
constituency_cross_section[constituency_name %in% leeds,
    constituency_group := "Leeds"]
constituency_cross_section[constituency_name %in% birmingham,
    constituency_group := "Birmingham"]
constituency_cross_section[constituency_name %in% hull,
    constituency_group := "Hull"]
constituency_cross_section[constituency_name %in% liverpool,
    constituency_group := "Liverpool"]
constituency_cross_section[constituency_name %in% bradford,
    constituency_group := "Bradford"]
constituency_cross_section[constituency_name %in% bristol,
    constituency_group := "Bristol"]
constituency_cross_section[constituency_name %in% swansea,
    constituency_group := "Swansea"]
constituency_cross_section[constituency_name %in% nottingham,
    constituency_group := "Nottingham"]

south_east <- c("Kent", "Surrey", "Sussex", "Hampshire", "Berkshire",
    "Middlesex", "Isle of Wight")
south_midland <- c("Hertfordshire", "Buckinghamshire", "Oxfordshire",
    "Northamptonshire", "Huntingdonshire", "Bedfordshire", "Cambridgeshire")
eastern <- c("Essex", "Suffolk", "Norfolk")
south_western <- c("Wiltshire", "Dorset", "Devon", "Cornwall", "Somerset")
west_midland <- c("Gloucestershire", "Herefordshire", "Shropshire",
    "Staffordshire", "Worcestershire", "Warwickshire")
north_midland <- c("Leicestershire", "Rutland", "Lincolnshire",
    "Nottinghamshire", "Derbyshire")
north_western <- c("Cheshire", "Lancashire")
yorkshire <- c("Yorkshire East Riding", "Yorkshire North Riding",
    "Yorkshire West Riding")
northern <- c("Durham", "Northumberland", "Westmorland", "Cumberland")
wales <- "Wales"
constituency_cross_section[county_name %in% south_east, region := "South East"]
constituency_cross_section[county_name %in% south_midland, region := "South Midland"]
constituency_cross_section[county_name %in% eastern, region := "Eastern"]
constituency_cross_section[county_name %in% south_western, region := "South Western"]
constituency_cross_section[county_name %in% west_midland, region := "West Midland"]
constituency_cross_section[county_name %in% north_midland, region := "North Midland"]
constituency_cross_section[county_name %in% north_western, region := "North Western"]
constituency_cross_section[county_name %in% yorkshire, region := "Yorkshire"]
constituency_cross_section[county_name %in% northern, region := "Northern"]
constituency_cross_section[county_name %in% wales, region := "Wales"]
constituency_cross_section[constituency_group == "London", region := "London"]


constituency_cross_section[, webb_county := county_name]
constituency_cross_section[constituency_group == "London", webb_county := "London"]
constituency_cross_section[county_name == "Middlesex", webb_county := "London"]
constituency_cross_section[webb_county== "Surrey", constituency_name]
constituency_cross_section[constituency_name %in% c("Croydon", "Kingston"),
    webb_county := "London"]
constituency_cross_section[county_name == "Wales", constituency_name]
south_wales <- c('Swansea District of Boroughs', 'Cardiff District of Boroughs',
    'Monmouth District of Boroughs', 'Breconshire', 'Carmarthenshire Western',
    'Cardiganshire', 'Merthyr Tydfil', 'Glamorganshire Eastern',
    'Monmouthshire Northern', 'Monmouthshire Southern', 'Pembrokeshire',
    'Carmarthen District of Boroughs', 'Carmarthenshire Eastern', 'Gower',
    'Rhondda', 'Glamorganshire Southern', 'Monmouthshire Western',
    'Swansea Town', 'Glamorganshire Mid',
    'Pembroke and Haverfordwest District of Boroughs')
constituency_cross_section[constituency_name %in% south_wales,
    webb_county := "South Wales"]
constituency_cross_section[!(constituency_name %in% south_wales) & county_name == "Wales",
    webb_county := "North Wales"]
constituency_cross_section[webb_county == "South Wales", constituency_name]

# data on unions from Webb and Webb
df_webb_union <- fread("Data/Trade Unions/webb_unions_1892.csv")

df_webb_union <- df_webb_union[, .(webb_county, webb_unionists,
    webb_frac_unionists_1892 = webb_unionists / webb_pop_1891)]


constituency_cross_section <- merge(constituency_cross_section, df_webb_union,
    by = "webb_county", all.x = T)
fwrite(constituency_cross_section,
    "Data/Intermediate outputs/constituency_cross_section.csv")

# disregard: check that output matches original analysis datasets
#df_temp <- fread("Data/Intermediate outputs/constituency_cross_section.csv")
#df_temp2 <- fread("~/Dropbox (Personal)/Trade shocks 1900s/Data/#Clean for analysis/constituency_cross_section.csv")
#col_keep <- colnames(df_temp)
#df_temp2 <- df_temp2[, ..col_keep]

#all.equal(df_temp[order(constituency_id)], df_temp2[order(constituency_id)])

# now prep for clean replication files
df_shock <- fread("Data/Intermediate outputs/shocks_and_shares.csv")
df_cross_section <- fread(
    "Data/Intermediate outputs/constituency_cross_section.csv"
    )

df_cross_section[, `:=`(
    group_pop = sum(const_pop, na.rm = T)),
    by = .(constituency_group)]

df_cross_section[, const_frac_n_irish := const_frac_foreign - const_frac_irish]

df_ind_fracs <- unique(df_shock[, .(
    const_frac_steel = const_frac_category[category_uk == "sheet iron and steel"],
    const_frac_sugar = const_frac_category[category_uk == "refined sugar"],
    const_frac_zinc = const_frac_category[category_uk == "sheet zinc"],
    const_frac_cotton = const_frac_category[category_uk == "cotton manufactures"],
    const_frac_lace = const_frac_category[category_uk == "lace"]),
    by = .(constituency_id, year)][,
    .(constituency_id, const_frac_steel, const_frac_sugar, const_frac_zinc,
        const_frac_cotton, const_frac_lace)])

df_cross_section <- merge(df_cross_section, df_ind_fracs,
    by = "constituency_id",
    all.x = T)

df_panel <- fread("Data/Intermediate outputs/constituency_panel.csv")
df_panel[, lib_lab_share := lab_p_share + lib_share]

df_panel[, `:=`(ln_frac_secondary = log(frac_secondary),
        ln_frac_vagrant = log(frac_vagrant),
        ln_frac_unskilled = log(frac_unskilled))]
df_panel[year %in% c(1880, 1890, 1900, 1910),
    `:=`(frac_vagrant_lag = shift(frac_vagrant, 1),
        frac_unskilled_lag = shift(frac_unskilled, 1),
        frac_secondary_lag = shift(frac_secondary, 1),
        hiscam_lag = shift(hiscam, 1)),
    by = .(constituency_id) ]
df_panel[year %in% c(1890, 1900, 1910),
    `:=`(fd_ln_frac_vagrant = ln_frac_vagrant - log(frac_vagrant_lag),
        fd_hiscam = hiscam - hiscam_lag,
        fd_ln_frac_unskilled = ln_frac_unskilled - log(frac_unskilled_lag)
    )]


df_panel[year %in% c(1892, 1900, 1910),
    `:=`(fd_shock_elec10_w_lead = shift(fd_shock_elec10_w, -1)),
    by = .(constituency_id)]


df_shock[year %in% c(1892, 1900, 1910),
    `:=`(fd_percap_elec10_w_lead = shift(fd_percap_elec10_w, -1)),
    by = .(constituency_id, category_uk)]


# pca of industry shares
df_ind <- df_shock[year == 1880 & !is.na(constituency_id),
    .(constituency_id, category_uk, const_frac_category)]

df_ind <- dcast(df_ind, constituency_id ~ category_uk,
    value.var = "const_frac_category")

pca <- prcomp(df_ind[, 2:96], center = T)
summary(pca)
df_ind <- cbind(df_ind[, 1], pca$x[, 1:3])
df_cross_section <- merge(df_cross_section, df_ind, by = "constituency_id",
    all.x = T)

df <- merge(df_shock[!is.na(constituency_id)],
    df_cross_section[!is.na(constituency_id)],
    by = c("constituency_id", "g_unit", "constituency_name"),
    all.x = T, all.y = T)
df <- merge(df, df_panel[!is.na(constituency_id)],
    by = c("constituency_id", "g_unit", "constituency_name", "year",
        "county_name", "const_type"))
df_s <- merge(df_panel, df_cross_section,
    by = c("constituency_id", "g_unit", "constituency_name", "const_type",
        "county_name"),
    all.x = T, all.y = T)


df[, region_year := as.factor(paste(region, year))]

df[, `:=`("sec_1886" = const_frac_secondary * (year == 1886),
    "sec_1890" = const_frac_secondary * (year == 1890),
    "sec_1892" = const_frac_secondary * (year == 1892),
    "sec_1895" = const_frac_secondary * (year == 1895),
    "sec_1900" = const_frac_secondary * (year == 1900),
    "sec_1906" = const_frac_secondary * (year == 1906),
    "sec_1910" = const_frac_secondary * (year == 1910),
    "sec_1911" = const_frac_secondary * (year == 1911),
    n_irish_1886 = const_frac_n_irish * (year == 1886),
    n_irish_1890 = const_frac_n_irish * (year == 1890),
    n_irish_1892 = const_frac_n_irish * (year == 1892),
    n_irish_1895 = const_frac_n_irish * (year == 1895),
    n_irish_1900 = const_frac_n_irish * (year == 1900),
    n_irish_1906 = const_frac_n_irish * (year == 1906),
    n_irish_1910 = const_frac_n_irish * (year == 1910),
    n_irish_1911 = const_frac_n_irish * (year == 1911),
    steel_1886 = const_frac_steel * (year == 1886),
    steel_1890 = const_frac_steel * (year == 1890),
    steel_1892 = const_frac_steel * (year == 1892),
    steel_1895 = const_frac_steel * (year == 1895),
    steel_1900 = const_frac_steel * (year == 1900),
    steel_1906 = const_frac_steel * (year == 1906),
    steel_1910 = const_frac_steel * (year == 1910),
    steel_1911 = const_frac_steel * (year == 1911),
    zinc_1886 = const_frac_zinc * (year == 1886),
    zinc_1890 = const_frac_zinc * (year == 1890),
    zinc_1892 = const_frac_zinc * (year == 1892),
    zinc_1895 = const_frac_zinc * (year == 1895),
    zinc_1900 = const_frac_zinc * (year == 1900),
    zinc_1906 = const_frac_zinc * (year == 1906),
    zinc_1910 = const_frac_zinc * (year == 1910),
    zinc_1911 = const_frac_zinc * (year == 1911),
    sugar_1886 = const_frac_sugar * (year == 1886),
    sugar_1890 = const_frac_sugar * (year == 1890),
    sugar_1892 = const_frac_sugar * (year == 1892),
    sugar_1895 = const_frac_sugar * (year == 1895),
    sugar_1900 = const_frac_sugar * (year == 1900),
    sugar_1906 = const_frac_sugar * (year == 1906),
    sugar_1910 = const_frac_sugar * (year == 1910),
    sugar_1911 = const_frac_sugar * (year == 1911),
    cotton_1886 = const_frac_cotton * (year == 1886),
    cotton_1890 = const_frac_cotton * (year == 1890),
    cotton_1892 = const_frac_cotton * (year == 1892),
    cotton_1895 = const_frac_cotton * (year == 1895),
    cotton_1900 = const_frac_cotton * (year == 1900),
    cotton_1906 = const_frac_cotton * (year == 1906),
    cotton_1910 = const_frac_cotton * (year == 1910),
    cotton_1911 = const_frac_cotton * (year == 1911),
    lace_1886 = const_frac_lace * (year == 1886),
    lace_1890 = const_frac_lace * (year == 1890),
    lace_1892 = const_frac_lace * (year == 1892),
    lace_1895 = const_frac_lace * (year == 1895),
    lace_1900 = const_frac_lace * (year == 1900),
    lace_1906 = const_frac_lace * (year == 1906),
    lace_1910 = const_frac_lace * (year == 1910),
    lace_1911 = const_frac_lace * (year == 1911),
    unions_1900 = webb_frac_unionists_1892 * (year == 1900),
    unions_1906 = webb_frac_unionists_1892 * (year == 1906),
    unions_1910 = webb_frac_unionists_1892 * (year == 1910),
    unions_1911 = webb_frac_unionists_1892 * (year == 1911),
    pc1_1890 = PC1 * (year == 1890),
    pc1_1900 = PC1 * (year == 1900),
    pc1_1906 = PC1 * (year == 1906),
    pc1_1910 = PC1 * (year == 1910),
    pc2_1890 = PC2 * (year == 1890),
    pc2_1900 = PC2 * (year == 1900),
    pc2_1906 = PC2 * (year == 1906),
    pc2_1910 = PC2 * (year == 1910),
    pc3_1890 = PC3 * (year == 1890),
    pc3_1900 = PC3 * (year == 1900),
    pc3_1906 = PC3 * (year == 1906),
    pc3_1910 = PC3 * (year == 1910)
    )]






sec_controls <- c("sec_1886", "sec_1890", "sec_1892", "sec_1895", "sec_1900",
    "sec_1906", "sec_1910", "sec_1911")
n_irish_controls <- c("n_irish_1886", "n_irish_1892", "n_irish_1895",
    "n_irish_1900", "n_irish_1906", "n_irish_1910", "n_irish_1911")
union_controls <- c("unions_1900", "unions_1906", "unions_1910", "unions_1911")
years <- c(1886, 1890, 1892, 1895, 1900, 1906, 1910, 1911)
steel_controls <- paste0("steel_", years)
zinc_controls <- paste0("zinc_", years)
sugar_controls <- paste0("sugar_", years)
cotton_controls <- paste0("cotton_", years)
lace_controls <- paste0("lace_", years)


print(sort(colnames(df)))
df[, group_county := county_name]
df[constituency_group == "London", group_county := "London"]


df_group <- df[, .(
    shock_diff_w = weighted.mean(shock_diff_w, const_pop, na.rm = T),
    shock_diff00w = weighted.mean(shock_diff00w, const_pop, na.rm = T),
    diff_percap_w = weighted.mean(diff_percap_w, const_pop, na.rm = T),
    diff_percap00w = weighted.mean(diff_percap00w, const_pop, na.rm = T),
    frac_vagrant = weighted.mean(frac_vagrant, const_pop, na.rm = T),
    const_frac_category = weighted.mean(const_frac_category, const_pop, na.rm = T),
    const_frac_secondary = weighted.mean(const_frac_secondary, const_pop, na.rm = T),
    const_frac_n_irish = weighted.mean(const_frac_n_irish, const_pop, na.rm = T)),
    by = .(constituency_group, county_name = group_county, group_pop, category_uk, year)]
df_group[, `:=`(sec_1886 = const_frac_secondary * (year == 1886),
    sec_1892 = const_frac_secondary * (year == 1892),
    sec_1895 = const_frac_secondary * (year == 1895),
    sec_1900 = const_frac_secondary * (year == 1900),
    sec_1906 = const_frac_secondary * (year == 1906),
    sec_1910 = const_frac_secondary * (year == 1910),
    sec_1911 = const_frac_secondary * (year == 1911),
    n_irish_1886 = const_frac_n_irish * (year == 1886),
    n_irish_1892 = const_frac_n_irish * (year == 1892),
    n_irish_1895 = const_frac_n_irish * (year == 1895),
    n_irish_1900 = const_frac_n_irish * (year == 1900),
    n_irish_1906 = const_frac_n_irish * (year == 1906),
    n_irish_1910 = const_frac_n_irish * (year == 1910),
    n_irish_1911 = const_frac_n_irish * (year == 1911))]

getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

df_group[, `:=`(
    diff_percap_w = getmode(diff_percap_w),
    diff_percap00w = getmode(diff_percap00w)),
    by = .(category_uk, year)]


df_group_s <- unique(df_group[, .(county_name, group_pop,
    constituency_group, year, const_frac_secondary, frac_vagrant,
    const_frac_n_irish,  shock_diff_w, shock_diff00w)])

group_lookup <- df_cross_section[, .(g_unit, constituency_group)]


df_news <- fread("Data/Newspapers/BNA_hits_2021_02_20.csv")



df_news[, term := str_replace_all(term, "\"[\"]*", "\"")]

df_news <- df_news[!is.na(hits) & !is.na(g_unit)]
df_news[, title := value]
df_news[, title_full := text]
df_news[, c("start_year", "end_year", "value", "text") := NULL]
df_news[, per_issue := hits / issues]
head(df_news)
group_lookup <- df_cross_section[, .(g_unit, constituency_group)]
df_news <- merge(df_news, group_lookup, by = "g_unit", all.x = T)



df_vars <- read_excel("codebook.xlsx", sheet = "variables")
head(df_vars)
vars <- unique(df_vars$variable_name)

drop_shares <- c("diff_percap",
    "export_diff_percap_w", "export_diff_percap00w", "region_year",
    "group_county" )





drop_short <- c('shock_diff', 'lab_p_share', 'cons_share_elec10_lag',
    'cons_share_1900', 'lab_share_lag', 'lib_share_lag', 'cons_share_lag',
    'unopposed_lag', 'fd_lab', 'fd_lib', 'fd_cons', 'ln_frac_secondary',
    'ln_frac_vagrant', 'ln_frac_unskilled', 'const_frac_foreign',
    'const_frac_irish', 'const_frac_wheat', 'region', 'webb_unionists',
    'group_pop')


df_cols_keep <- colnames(df)[!(colnames(df) %in% drop_short) &
    !(colnames(df) %in% c(drop_shares, drop_shares))]

df_dropped <- df[, ..df_cols_keep]
df_s_cols_keep <- colnames(df_s)[!(colnames(df_s) %in% drop_short)]
df_s_cols_keep
df_s_dropped <- df_s[, ..df_s_cols_keep]

group_s_vars <- c("constituency_group", "year", "shock_diff_w", "county_name",
    "const_frac_secondary", "group_pop", "shock_diff00w", "const_frac_n_irish")
group_vars <- c(group_s_vars, "category_uk", "const_frac_category",
    "diff_percap_w", "diff_percap00w", "sec_1886", "sec_1892", "sec_1895",
    "sec_1900", "sec_1906", "sec_1910", "sec_1911",
    "n_irish_1886", "n_irish_1892", "n_irish_1895", "n_irish_1900",
    "n_irish_1906", "n_irish_1910", "n_irish_1911")

df_group_s_dropped <- df_group_s[, ..group_s_vars]
df_group_dropped <- df_group[, ..group_vars]

fwrite(df_dropped, "Data/_Final datasets/share_level_data.csv")
fwrite(df_s_dropped, "Data/_Final datasets/constituency_level_data.csv")
fwrite(df_group_dropped, "Data/_Final datasets/share_level_data_grouped.csv")
fwrite(df_group_s_dropped, "Data/_Final datasets/constituency_level_data_grouped.csv")
fwrite(df_news, "Data/_Final datasets/newspaper_data.csv")

# code to prep Bronner-Ziblatt data (note: data is not publicly available)
#df_manifestos <- fread("Data/Manifestos/all_elections_speeches_census.csv")
#colnames(df_manifestos)
#df_manifestos <- df_manifestos[, .(election_id, name_on_return, member_id,
#    party, incumbent, text, ntoken, vote_share, election.yr)]

#df_manifestos[, l_text := tolower(str_trim(text))]
#head(df_manifestos)
#get_manif_counts <- function(word, df_manif_temp) {
#    df_manif_temp[, term_count := str_count(l_text, word)]
#    df_manif_temp[, term := word]
#    return(df_manif_temp[, .(election_id, name_on_return, member_id, party, ntoken,
#        election.yr, term, term_count)])
#}


#df_manif_out <- lapply(c("immigra[un]t", "alien", "jew", "foreigner",
#    "alien|immigra[un]t|jew|foreigner", "social reform", "poor law", "labour exchange",
#    "social reform|poor law|labour exchange", "germany",
#    "germany|teuton|prussia|fatherland|kaiser",
#    "naval|navy|battleship|dreadnought|fleet",
#    "national service league|navy league"), function(x) get_manif_counts(x, df_manifestos)) %>%
#    rbindlist()
#tail(df_manif_out)

#fwrite(df_manif_out, "Data/_Final datasets/manifesto_data.csv")


#%% data for figures
df_total_imports <- read_excel("Data/Trade statistics/Statement of trade.xlsx",
    sheet = "Germany Imports Timeseries") %>% data.table()

df_total_imports[, imports := (imports_from_germany + imports_from_netherlands +
    imports_from_belgium) / 1000000]
fwrite(df_total_imports[, .(year, imports_from_germany, imports_from_netherlands,
    imports_from_belgium, imports_combined = imports)],
    file = "Data/_Data for figures/figure_1.csv")

df_total_imports_1910 <- transpose(df_total_imports[year == 1910, 2:7],
    )

df_total_imports_1910$variable <- colnames(df_total_imports)[2:7]
df_total_imports_1910$country <- rep(c("Germany", "Netherlands", "Belgium"), 2)
df_total_imports_1910$type <- rep(c("Port", "Origin"), each = 3)
df_total_imports_1910[, imports := V1 / 1000000]
df_total_imports_1910[, type := factor(type,levels = c("Port", "Origin"))]
df_total_imports_1910[, country := factor(country,levels = c("Germany", "Belgium", "Netherlands"))]
df_total_imports_1910[, V1 := NULL]

fwrite(df_total_imports_1910, "Data/_Data for figures/figure_a1.csv")

df_import_cat <- fread("Data/Intermediate outputs/import_year_data.csv")
head(df_import_cat)
df_import_cat <- df_import_cat[, .(category_uk, year, value_tot)]
fwrite(df_import_cat, "Data/_Data for figures/figure_2.csv")

df_welfare <- read_excel("Data/Government spending/Boyer welfare statistics.xlsx",
    sheet = "series") %>% data.table()
head(df_welfare)
fwrite(df_welfare, "Data/_Data for figures/figure_3.csv")

df_times_words <- fread("Data/Times ngrams/NGramData_unemp_times.csv")
head(df_times_words)
colnames(df_times_words) <- c("term", "year", "references")
df_times_words[, term := str_replace_all(term, "\"\"", "\"")]
fwrite(df_times_words, "Data/_Data for figures/figure_6.csv")


df_trad_hist <- read_excel("Data/TRADHIST/TRADHIST_BITRADE_BITARIFF_1.xlsx") %>%
    data.table()
head(df_trad_hist)
df_trad_hist <- df_trad_hist[between(year, 1880, 1910) & (iso_o %in% c("DEU", "BEL", "NLD") |
    iso_d %in% c("GBR"))]
head(df_trad_hist)
fwrite(df_trad_hist, "Data/_Data for figures/figure_a2.csv")
